{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import OutlierMahalanobis\n",
    "reload(OutlierMahalanobis)\n",
    "from OutlierMahalanobis import OutlierMahalanobis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "n_batch = 1000\n",
    "p = 200\n",
    "\n",
    "tmp_mat = np.random.normal(0,1,size=(p,p))\n",
    "real_cov = np.matmul(tmp_mat.transpose(),tmp_mat)\n",
    "real_mean = np.random.normal(0,5,size=p)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "om_full = OutlierMahalanobis(n_components = 10)\n",
    "om = OutlierMahalanobis(n_components = 3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "features = np.random.multivariate_normal(real_mean,real_cov,size=n_batch)\n",
    "feature_names = []"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "oscores = om.score(features,feature_names)\n",
    "oscores_full = om_full.score(features,feature_names)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "plt.plot(oscores)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "plt.plot(oscores_full)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "np.min(oscores_full)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Test 1: 2 distributions"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We will generate features coming from a distribution A 99% of the time and B 1% of the time. Points coming from distribution B are the outliers."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 2 dimensional case"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "For visualisation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "n_batch = 200\n",
    "p = 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "tmp_mat = np.random.normal(0,1,size=(p,p))\n",
    "real_cov_A = np.matmul(tmp_mat.transpose(),tmp_mat)\n",
    "real_mean_A = np.random.normal(0,2,size=p)\n",
    "\n",
    "tmp_mat = np.random.normal(0,1,size=(p,p))\n",
    "real_cov_B = np.matmul(tmp_mat.transpose(),tmp_mat)\n",
    "real_mean_B = np.random.normal(-6,10,size=p)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "real_mean_A"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "real_cov_A"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "features_A = np.random.multivariate_normal(real_mean_A,real_cov_A,size=n_batch)\n",
    "features_B = np.random.multivariate_normal(real_mean_B, real_cov_B, size=n_batch)\n",
    "\n",
    "plt.scatter(*features_A.transpose())\n",
    "plt.scatter(*features_B.transpose())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "is_outlier = np.random.uniform(0,1,size=n_batch)>0.99\n",
    "\n",
    "print is_outlier.sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "features = features_A*(1-is_outlier[:,None]) + features_B*is_outlier[:,None]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "plt.scatter(*features.transpose(),c=is_outlier,cmap=\"Accent\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "om = OutlierMahalanobis()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "oscores = om.score(features,[])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "plt.scatter(np.arange(n_batch),oscores,c=is_outlier,cmap=\"Accent\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Multi dimensional case (p=8)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "n_batch = 200\n",
    "p = 8"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "tmp_mat = np.random.normal(0,1,size=(p,p))\n",
    "real_cov_A = np.matmul(tmp_mat.transpose(),tmp_mat)\n",
    "real_mean_A = np.random.normal(0,2,size=p)\n",
    "\n",
    "tmp_mat = np.random.normal(0,1,size=(p,p))\n",
    "real_cov_B = np.matmul(tmp_mat.transpose(),tmp_mat)\n",
    "real_mean_B = np.random.normal(-6,10,size=p)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "features_A = np.random.multivariate_normal(real_mean_A,real_cov_A,size=n_batch)\n",
    "features_B = np.random.multivariate_normal(real_mean_B, real_cov_B, size=n_batch)\n",
    "\n",
    "plt.scatter(*features_A[:,:2].transpose())\n",
    "plt.scatter(*features_B[:,:2].transpose())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "is_outlier = np.random.uniform(0,1,size=n_batch)>0.99\n",
    "\n",
    "print is_outlier.sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "features = features_A*(1-is_outlier[:,None]) + features_B*is_outlier[:,None]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "plt.scatter(*features[:,:2].transpose(),c=is_outlier,cmap=\"Accent\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "om = OutlierMahalanobis()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "oscores = om.score(features,[])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "plt.scatter(np.arange(n_batch),oscores,c=is_outlier,cmap=\"Accent\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "om_full = OutlierMahalanobis(n_components=8)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "oscores_full = om_full.score(features,[])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "plt.scatter(np.arange(n_batch),oscores_full,c=is_outlier,cmap=\"Accent\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### High dimensional case (p=200)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "n_batch = 500\n",
    "p = 200"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "tmp_mat = np.random.normal(0,0.1,size=(p,p))\n",
    "real_cov_A = np.matmul(tmp_mat.transpose(),tmp_mat)\n",
    "real_mean_A = np.random.normal(0,2,size=p)\n",
    "\n",
    "tmp_mat = np.random.normal(0,1,size=(p,p))\n",
    "real_cov_B = np.matmul(tmp_mat.transpose(),tmp_mat)\n",
    "real_mean_B = np.random.normal(-6,10,size=p)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "features_A = np.random.multivariate_normal(real_mean_A,real_cov_A,size=n_batch)\n",
    "features_B = np.random.multivariate_normal(real_mean_B, real_cov_B, size=n_batch)\n",
    "\n",
    "plt.scatter(*features_A[:,:2].transpose())\n",
    "plt.scatter(*features_B[:,:2].transpose())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "is_outlier = np.random.uniform(0,1,size=n_batch)>0.99\n",
    "\n",
    "print is_outlier.sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "features = features_A*(1-is_outlier[:,None]) + features_B*is_outlier[:,None]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "plt.scatter(*features[:,:2].transpose(),c=is_outlier,cmap=\"Accent\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "om = OutlierMahalanobis()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "oscores = om.score(features,[])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "plt.scatter(np.arange(n_batch),oscores,c=is_outlier,cmap=\"Accent\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "om_full = OutlierMahalanobis(n_components=p)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "oscores_full = om_full.score(features,[])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "plt.scatter(np.arange(n_batch),oscores_full,c=is_outlier,cmap=\"Accent\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## MNIST Dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from tensorflow.examples.tutorials.mnist import input_data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "mnist = input_data.read_data_sets(\"MNIST_data/\", one_hot=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "all_features = mnist.train.images"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "n_batch = 1000\n",
    "p = 784"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "features_A = all_features[:n_batch]\n",
    "features_B = np.random.uniform(0,1,size=(n_batch,p))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "is_outlier = np.random.uniform(0,1,size=n_batch)>0.99\n",
    "\n",
    "print is_outlier.sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "features = features_A*(1-is_outlier[:,None]) + features_B*is_outlier[:,None]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "om = OutlierMahalanobis(n_components=9)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "oscores = om.score(features,[])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "plt.scatter(np.arange(n_batch-100),oscores[100:],c=is_outlier[100:],cmap=\"Accent\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python [default]",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
